In [1]:
import requests  # to make GET request
from bs4 import BeautifulSoup  # to parse the HTML response
import time  # to pause between calls
import pandas as pd  # to see CSV
import os
import re

os.chdir('../data/')

In [2]:
columns=['post id','title','text','href','user id','mother post id']
df = pd.DataFrame(columns=columns)

columns=['user id','user description']
df_users = pd.DataFrame(columns=columns)

# Initialize post index
post_id=0

In [3]:
def add_post(post_id,title,text,url,mother_post_id,user_id,user_name):
    global df,df_users
    #
    # Add post data to dataframe
    #
    newrow={"post id":post_id,
            "title":title,
            "text":text,
            "href":url,
            "user id":user_id,
            "mother post id":mother_post_id}
    df.loc[len(df.values)]=newrow
    # Update user dataframe:
    #
    newrow={"user id":user_id,
          "user description":user_name}
    if user_id not in df_users['user id'].values:
        df_users.loc[len(df_users)]=newrow
    
    
#url="http://ehealthforum.com/health/autism-recovery-success-story-t351300.html"
def parse_post(url):
    global df,df_users,post_id

    user_href_prefix="http://ehealthforum.com/health/user_profile_"
    p = re.compile('\n+')

    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    mother_post_id=post_id
    post_id=post_id+1

    #
    # Extract data from soup:
    #
    post = soup.find("div", class_="vt_h2")
    title=post.find("h1", class_="caps").text
    #
    # Mother post:
    post1=soup.find("div", class_="vt_first_message_body")
    #
    #
    # User
    #
    user_data = soup.find("span",class_="vt_asked_by_user")
    user_name = user_data.text
    user_id = user_data.find("a")['href'].replace(user_href_prefix,"").replace(".html","")
    user_description = soup.find("span",class_="vt_user_rank").text
    user_name=user_name+" "+user_description    
    #
    # Post text
    #
    text=post1.text.replace("\t","")
    text=p.sub('\n', text)
    #
    # Add post to dataframe
    #
    add_post(post_id,title,text,url,mother_post_id,user_id,user_name)
    #
    # Follow up to that message
    #messages=soup.find_all("div", class_="vt_message_body")
    #for message in messages:
    #    post_id=post_id+1
    #    text=message.text.replace("\t","")
    #    text=p.sub('\n', text)
    postrows=soup.find_all("div",class_="vt_postrow_rest")
    for postrow in postrows[:len(postrows)-1]:
        post_id=post_id+1
        text=postrow.find("div",class_="vt_post_body").text
        text=text.replace("\t","")
        text=p.sub('\n', text)
        user_id=postrow.find("a")['href'].replace(user_href_prefix,"").replace(".html","")
        user_name=postrow.find("div",class_="vt_asked_by_user").text
        user_description=postrow.find("span",class_="vt_user_rank").text
        user_name=user_name+" "+user_description
        add_post(post_id,title,text,url,mother_post_id,user_id,user_name)
for message in messages: post_id=post_id+1 #title=message.find("div",title="subject").text mm=message.text.replace("\t","") mm=p.sub('\n', mm)
url="http://ehealthforum.com/health/autism-recovery-success-story-t351300.html" parse_post(url) df

In [ ]:
df_users


Out[ ]:
user id user description

In [ ]:
url_list=["http://ehealthforum.com/health/autism.html",
          "http://ehealthforum.com/health/autism_medical_questions_242_0_50.html",
          "http://ehealthforum.com/health/autism_medical_questions_242_0_100.html",
          "http://ehealthforum.com/health/autism_medical_questions_242_0_150.html",
          "http://ehealthforum.com/health/autism_medical_questions_242_0_200.html"]

for url in url_list:
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')

    content=soup.find_all("div",class_="fp_topic_content_title")

    for topic in content:
        url=topic.find("a", class_='topictitle')['href']
        print(url)
        parse_post(url)


http://ehealthforum.com/health/possible-autism-signs-t230405.html
http://ehealthforum.com/health/autism-i-think-he-is-on-point-for-yrs-old-t312115.html
http://ehealthforum.com/health/symptoms-of-autism-t200703.html
http://ehealthforum.com/health/autism-recovery-success-story-t351300.html
http://ehealthforum.com/health/increased-intracranial-pressure-associated-with-autism-t512431.html

In [ ]:
df.to_csv('ehealthforum-posts.csv',index=False)
df_users.to_csv('ehealthforum-users.csv',index=False)

In [ ]:
df.tail()